# -*- coding: utf-8 -*-
import csv

import jieba
import pandas as pd


def fcut(juzi):
    """Segment a Chinese sentence with jieba and filter the tokens.

    Parameters
    ----------
    juzi : str
        Raw sentence to segment.

    Returns
    -------
    str
        Space-separated keywords: stopwords removed, digits stripped from
        each token, and tokens of length <= 1 discarded.
    """
    # Read-only access suffices; the original 'r+' requested write access
    # on a file that is never written.
    with open('stop.txt', 'r', encoding='utf8') as stop:
        # Use a set for O(1) stopword membership tests.
        stopwords = set(stop.read().split('\n'))

    wordlist = []
    # Iterate jieba's generator directly instead of the original
    # '/'.join(...).split('/') round-trip, which would mangle any token
    # containing a '/' character.
    for key in jieba.cut(juzi, cut_all=False):
        key = key.strip()
        if key in stopwords or len(key) <= 1:
            continue
        # Remove digits; the token may become empty afterwards (e.g. "12"
        # passes the length check but strips to ""), so re-check before
        # keeping it — the original appended such empty strings.
        key = ''.join(ch for ch in key if not ch.isdigit())
        if key:
            wordlist.append(key)
    return ' '.join(wordlist)


def fenci(filename, newf):
    """Segment every row of a labeled dataset and write a label/content CSV.

    Parameters
    ----------
    filename : str
        Input dataset with 'label' and 'content' columns. ``.xls``/``.xlsx``
        files are read with ``pd.read_excel``; anything else (e.g. ``.csv``)
        with ``pd.read_csv``.
    newf : str
        Output CSV path; written with a 'label','content' header row and
        one segmented row per input row.
    """
    # Bug fix: the original always used pd.read_excel, which fails on the
    # .csv inputs this script is actually called with — dispatch on the
    # file extension instead.
    if filename.lower().endswith(('.xls', '.xlsx')):
        reader = pd.read_excel(filename)
    else:
        reader = pd.read_csv(filename)

    labels = []
    contents = []
    for _, row in reader.iterrows():
        labels.append(row['label'])
        # 中文分词 (Chinese word segmentation)
        contents.append(fcut(row['content']))

    with open(newf, 'w', newline='', encoding='utf8') as f:
        writer = csv.writer(f)
        writer.writerow(['label', 'content'])
        writer.writerows(zip(labels, contents))


if __name__ == '__main__':
    # Segment the three dataset splits; guard so importing this module
    # does not trigger the batch run.
    fenci('train_c.csv', 'train_cc.csv')
    fenci('val_c.csv', 'val_cc.csv')
    fenci('test_c.xlsx', 'test_cc.csv')
